library(rmarkdown)
library(tidyverse)
library(tidymodels)
library(ggplot2)
library(GGally)
library(plotly)
library(ggcorrplot)
library(ggfortify)
library(skimr)
library(caret)
library(factoextra)
library(reactable)
library(glmnet)
library(kknn)
library(ranger)
library(vip)
library(ROCR)
library(pROC)
# Load the audit risk dataset and print a skimr summary of every column.
data <- read.csv(file = "audit_risk.csv")
skim(data)
| Name | data |
| Number of rows | 776 |
| Number of columns | 27 |
| _______________________ | |
| Column type frequency: | |
| character | 1 |
| numeric | 26 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| LOCATION_ID | 0 | 1 | 1 | 7 | 0 | 45 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| Sector_score | 0 | 1 | 20.18 | 24.32 | 1.85 | 2.37 | 3.89 | 55.57 | 59.85 | ▇▁▁▁▃ |
| PARA_A | 0 | 1 | 2.45 | 5.68 | 0.00 | 0.21 | 0.88 | 2.48 | 85.00 | ▇▁▁▁▁ |
| Score_A | 0 | 1 | 0.35 | 0.17 | 0.20 | 0.20 | 0.20 | 0.60 | 0.60 | ▇▁▃▁▅ |
| Risk_A | 0 | 1 | 1.35 | 3.44 | 0.00 | 0.04 | 0.17 | 1.49 | 51.00 | ▇▁▁▁▁ |
| PARA_B | 0 | 1 | 10.80 | 50.08 | 0.00 | 0.00 | 0.41 | 4.16 | 1264.63 | ▇▁▁▁▁ |
| Score_B | 0 | 1 | 0.31 | 0.17 | 0.20 | 0.20 | 0.20 | 0.40 | 0.60 | ▇▁▁▁▃ |
| Risk_B | 0 | 1 | 6.33 | 30.07 | 0.00 | 0.00 | 0.08 | 1.84 | 758.78 | ▇▁▁▁▁ |
| TOTAL | 0 | 1 | 13.22 | 51.31 | 0.00 | 0.54 | 1.37 | 7.71 | 1268.91 | ▇▁▁▁▁ |
| numbers | 0 | 1 | 5.07 | 0.26 | 5.00 | 5.00 | 5.00 | 5.00 | 9.00 | ▇▁▁▁▁ |
| Score_B.1 | 0 | 1 | 0.22 | 0.08 | 0.20 | 0.20 | 0.20 | 0.20 | 0.60 | ▇▁▁▁▁ |
| Risk_C | 0 | 1 | 1.15 | 0.54 | 1.00 | 1.00 | 1.00 | 1.00 | 5.40 | ▇▁▁▁▁ |
| Money_Value | 1 | 1 | 14.14 | 66.61 | 0.00 | 0.00 | 0.09 | 5.60 | 935.03 | ▇▁▁▁▁ |
| Score_MV | 0 | 1 | 0.29 | 0.16 | 0.20 | 0.20 | 0.20 | 0.40 | 0.60 | ▇▁▁▁▂ |
| Risk_D | 0 | 1 | 8.27 | 39.97 | 0.00 | 0.00 | 0.02 | 2.24 | 561.02 | ▇▁▁▁▁ |
| District_Loss | 0 | 1 | 2.51 | 1.23 | 2.00 | 2.00 | 2.00 | 2.00 | 6.00 | ▇▁▁▁▁ |
| PROB | 0 | 1 | 0.21 | 0.04 | 0.20 | 0.20 | 0.20 | 0.20 | 0.60 | ▇▁▁▁▁ |
| RiSk_E | 0 | 1 | 0.52 | 0.29 | 0.40 | 0.40 | 0.40 | 0.40 | 2.40 | ▇▁▁▁▁ |
| History | 0 | 1 | 0.10 | 0.53 | 0.00 | 0.00 | 0.00 | 0.00 | 9.00 | ▇▁▁▁▁ |
| Prob | 0 | 1 | 0.21 | 0.04 | 0.20 | 0.20 | 0.20 | 0.20 | 0.60 | ▇▁▁▁▁ |
| Risk_F | 0 | 1 | 0.05 | 0.31 | 0.00 | 0.00 | 0.00 | 0.00 | 5.40 | ▇▁▁▁▁ |
| Score | 0 | 1 | 2.70 | 0.86 | 2.00 | 2.00 | 2.40 | 3.25 | 5.20 | ▇▁▁▂▁ |
| Inherent_Risk | 0 | 1 | 17.68 | 54.74 | 1.40 | 1.58 | 2.21 | 10.66 | 801.26 | ▇▁▁▁▁ |
| CONTROL_RISK | 0 | 1 | 0.57 | 0.44 | 0.40 | 0.40 | 0.40 | 0.40 | 5.80 | ▇▁▁▁▁ |
| Detection_Risk | 0 | 1 | 0.50 | 0.00 | 0.50 | 0.50 | 0.50 | 0.50 | 0.50 | ▁▁▇▁▁ |
| Audit_Risk | 0 | 1 | 7.17 | 38.67 | 0.28 | 0.32 | 0.56 | 3.25 | 961.51 | ▇▁▁▁▁ |
| Risk | 0 | 1 | 0.39 | 0.49 | 0.00 | 0.00 | 0.00 | 1.00 | 1.00 | ▇▁▁▁▅ |
# Clean and standardize the working copy of the data.
# Drop incomplete rows, coerce every column to numeric (the character
# LOCATION_ID yields NA for non-numeric codes), then drop the rows those
# coercions invalidated. vapply is used instead of sapply so the
# column-wise result type is declared explicitly.
df <- na.omit(data)
df <- vapply(df, as.numeric, numeric(nrow(df)))
df <- na.omit(df)
# Exclude zero-variance columns (scale() would divide by zero there),
# then center and scale everything that remains.
nonconstant <- apply(df, 2, var) != 0
df <- as.data.frame(scale(as.data.frame(df[, nonconstant])))
# Per-group mean and standard deviation of every column, split by Risk.
# across() replaces the superseded summarise_all(); rows of the transposed
# summary are then selected BY NAME ("<col>_M" / "<col>_ST") instead of the
# original fragile positional slices [2:26] / [27:51].
stats_by_risk <- df %>%
  group_by(Risk) %>%
  summarise(across(everything(), list(M = mean, ST = sd)))
stats_by_risk <- round(stats_by_risk, 2)
stats_t <- as.data.frame(t(stats_by_risk))
colnames(stats_t) <- c("Risk0", "Risk1")
stats_t <- na.omit(stats_t)
# All predictor columns (Risk is the grouping variable, not summarised).
vars <- setdiff(colnames(df), "Risk")
mean_rows <- paste0(vars, "_M")
sd_rows <- paste0(vars, "_ST")
cc <- tibble(Variavel = vars,
             MEDIA0 = stats_t[mean_rows, "Risk0"],
             SD0 = stats_t[sd_rows, "Risk0"],
             MEDIA1 = stats_t[mean_rows, "Risk1"],
             SD1 = stats_t[sd_rows, "Risk1"])
# Grouped bar chart: mean and sd per variable for each Risk group.
fig <- plot_ly(data = cc, x = ~Variavel,
               y = ~MEDIA0, type = 'bar', name = "MEDIA0")
fig <- fig %>% add_trace(y = ~SD0, name = 'SD0')
fig <- fig %>% add_trace(y = ~MEDIA1, name = 'MEDIA1')
fig <- fig %>% add_trace(y = ~SD1, name = 'SD1')
fig <- fig %>% layout(barmode = 'group')
# Title typo fixed ("padraão" -> "padrão").
fig <- fig %>% layout(title = "Média e desvio padrão por grupo de Risk (0 ou 1)",
                      yaxis = list(title = ""))
fig
Analisando o PCA, é possível determinar o melhor número de variáveis ou o ponto de corte a ser utilizado.
# PCA on the predictors (outcome excluded) to judge how many components
# are worth keeping: scree plot plus a table of variance explained.
df_pca <- select(df, -Risk)
res_pca <- prcomp(df_pca)
fviz_eig(res_pca)
s <- summary(res_pca)
s <- as.data.frame(round(s$importance, 3))
reactable(s, highlight = TRUE, resizable = TRUE, compact = TRUE)
# Interactive upper-triangle correlation heatmap of all columns.
# (Renamed from `c`, which shadows the base concatenation function.)
corr_plot <- ggcorrplot(cor(df),
                        lab = TRUE, lab_size = 2, tl.cex = 10,
                        type = "upper",
                        colors = c("red", "white", "blue"),
                        show.diag = TRUE, digits = 1)
ggplotly(corr_plot)
# Correlation of each predictor with the Risk indicator, shown as a
# color-graded bar chart.
corr <- function(x) cor(x, df$Risk)
corre <- map_dbl(df %>% select(-Risk), corr)
corre <- as.data.frame(corre)
fig <- plot_ly(
  x = row.names(corre),
  y = corre$corre,
  text = row.names(corre),
  name = "Correlacao com Risco",
  type = 'bar', orientation = 'v',
  color = ~corre$corre, colors = c("red", "white", "blue"))
# Axis-title typo fixed ("Corrleção" -> "Correlação").
fig <- fig %>% layout(title = "Correlação com Risk",
                      yaxis = list(title = "Correlação"))
fig
# Register a parallel backend for tuning, leaving two logical cores free.
library(doParallel)
n_workers <- parallel::detectCores(logical = TRUE) - 2
cl <- makeCluster(n_workers)
registerDoParallel(cl)
# Rebuild the modeling frame from the raw data: coerce every column to
# numeric and turn the outcome into a factor for classification.
df <- as.data.frame(sapply(data, as.numeric)) %>%
  mutate(Risk = factor(Risk))
# 70/30 train/test split, stratified on the outcome.
split <- initial_split(df, prop = 0.7, strata = "Risk")
treinamento <- training(split)
teste <- testing(split)
# 10-fold stratified CV on the training set. Defined ONCE (the original
# built the identical object twice, wasting a second round of sampling).
cv_split <- vfold_cv(treinamento, v = 10, strata = "Risk")
# Preprocessing recipe: drop NA rows, then center/scale BEFORE PCA so the
# components are not dominated by the high-variance raw columns (the EDA
# above scaled the data manually for exactly this reason), then keep 15 PCs.
receita <- recipe(Risk ~ ., treinamento) %>%
  step_naomit(all_numeric()) %>%
  step_center(all_numeric(), -all_outcomes()) %>%
  step_scale(all_numeric(), -all_outcomes()) %>%
  step_pca(all_numeric(), -all_outcomes(), num_comp = 15)
receita_prep <- prep(receita)
treinamento_proc <- juice(receita_prep)
teste_proc <- bake(receita_prep, new_data = teste)
# LASSO logistic regression (glmnet). mixture = 1 makes the lasso penalty
# explicit rather than relying on the engine default.
lasso <- logistic_reg(penalty = tune(), mixture = 1) %>%
  set_engine("glmnet") %>%
  set_mode("classification")
lambda_tune <- tune_grid(lasso,
receita,
resamples = cv_split,
metrics = metric_set(roc_auc, accuracy),
grid = 50)
autoplot(lambda_tune)
# Named `metric =` (positional metric is deprecated in tune >= 1.0);
# selection by ROC AUC for consistency with the RF and KNN models below.
best <- select_best(lambda_tune, metric = "roc_auc")
lasso <- finalize_model(lasso, parameters = best)
fit_lasso <- fit(lasso, Risk ~ ., data = treinamento_proc)
pred_lasso <- predict(fit_lasso, teste_proc)
# Random forest (ranger) with permutation importance; tune mtry and trees.
# The original piped through translate(), which only renders the engine
# call for inspection and is not needed in the spec — removed.
rf <- rand_forest(mtry = tune(), trees = tune()) %>%
  set_engine("ranger", importance = "permutation") %>%
  set_mode("classification")
rf_tune <- tune_grid(rf,
receita,
resamples = cv_split,
metrics = metric_set(roc_auc, accuracy),
grid = 10)
autoplot(rf_tune)
# Named `metric =` (positional metric is deprecated in tune >= 1.0).
best <- select_best(rf_tune, metric = "roc_auc")
rf <- finalize_model(rf, parameters = best)
fit_rf <- fit(rf, Risk ~ ., data = treinamento_proc)
pred_rf <- predict(fit_rf, teste_proc)
# K-nearest neighbors (kknn); tune the neighborhood size over 50 candidates.
knn <- nearest_neighbor(neighbors = tune()) %>%
  set_engine("kknn") %>%
  set_mode("classification")
knn_tune <- tune_grid(knn,
receita,
resamples = cv_split,
metrics = metric_set(roc_auc, accuracy),
grid = 50)
autoplot(knn_tune)
# Named `metric =` (positional metric is deprecated in tune >= 1.0).
best <- select_best(knn_tune, metric = "roc_auc")
knn <- finalize_model(knn, parameters = best)
fit_knn <- fit(knn, Risk ~ ., data = treinamento_proc)
pred_knn <- predict(fit_knn, teste_proc)
# Shut down the parallel workers now that tuning is finished.
stopCluster(cl)
# Collect the test-set truth and each model's hard class predictions,
# coerced to numeric (factor levels map to 1/2, consistently across all
# columns, so pairwise comparisons remain valid).
test_train <- data.frame(Teste=as.numeric(teste_proc$Risk),
LASSO=as.numeric(pred_lasso$.pred_class),
RF=as.numeric(pred_rf$.pred_class),
KNN=as.numeric(pred_knn$.pred_class))
# Model names, used later for the EQM bar chart.
modelos <- colnames(test_train[,2:length(test_train)])
# One ROC curve per model column (pROC returns a list for multi-predictor
# formulas), rendered interactively.
# NOTE(review): these ROCs are built from hard class labels rather than
# predicted probabilities, so each curve has a single interior threshold —
# consider predict(..., type = "prob") for more informative curves.
rocs <- roc(Teste ~ ., data = test_train)
ggplotly(ggroc(rocs))
# Mean squared error between two equal-length numeric vectors.
EQMF <- function(x, y) {
  residuals <- x - y
  mean(residuals^2)
}
# Mean squared error of each model's predictions against the test truth.
# vapply (with a declared scalar return) replaces sapply, whose result
# type silently depends on the input.
model_cols <- seq(2, ncol(test_train))
EQM <- vapply(model_cols,
              function(j) EQMF(test_train[, j], test_train$Teste),
              numeric(1))
# Bar chart of EQM per model, ordered from best (lowest) to worst.
fig <- plot_ly(x = ~reorder(modelos, EQM), y = EQM, type = "bar", name = "EQM")
fig <- fig %>% layout(title = "EQM x Modelo",
                      xaxis = list(title = "Modelo"),
                      yaxis = list(title = "EQM"))
fig